-SAIF MERCHANT
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.metrics import average_precision_score, confusion_matrix, accuracy_score, classification_report, plot_confusion_matrix
from sklearn import metrics
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.model_selection import train_test_split
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from statsmodels.stats.outliers_influence import variance_inflation_factor
import ipywidgets as widgets
from IPython.display import display
from ipywidgets import interact
import plotly.express as px
from sklearn.metrics import recall_score
from imblearn.over_sampling import SMOTE
import plotly.express as px
from sklearn.pipeline import Pipeline
import warnings
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.utils import DataConversionWarning
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import resample
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
Q1 A - Import ‘signal-data.csv’ as DataFrame.
signal = pd.read_csv("C:/Users/SAIF MERCHANT/Desktop/Great Learning/Featurization, Model Selection & Tuning/Project/signal-data.csv")
signal
| Time | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | ... | 581 | 582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | Pass/Fail | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2008-07-19 11:55:00 | 3030.93 | 2564.00 | 2187.7333 | 1411.1265 | 1.3602 | 100.0 | 97.6133 | 0.1242 | 1.5005 | ... | NaN | 0.5005 | 0.0118 | 0.0035 | 2.3630 | NaN | NaN | NaN | NaN | -1 |
| 1 | 2008-07-19 12:32:00 | 3095.78 | 2465.14 | 2230.4222 | 1463.6606 | 0.8294 | 100.0 | 102.3433 | 0.1247 | 1.4966 | ... | 208.2045 | 0.5019 | 0.0223 | 0.0055 | 4.4447 | 0.0096 | 0.0201 | 0.0060 | 208.2045 | -1 |
| 2 | 2008-07-19 13:17:00 | 2932.61 | 2559.94 | 2186.4111 | 1698.0172 | 1.5102 | 100.0 | 95.4878 | 0.1241 | 1.4436 | ... | 82.8602 | 0.4958 | 0.0157 | 0.0039 | 3.1745 | 0.0584 | 0.0484 | 0.0148 | 82.8602 | 1 |
| 3 | 2008-07-19 14:43:00 | 2988.72 | 2479.90 | 2199.0333 | 909.7926 | 1.3204 | 100.0 | 104.2367 | 0.1217 | 1.4882 | ... | 73.8432 | 0.4990 | 0.0103 | 0.0025 | 2.0544 | 0.0202 | 0.0149 | 0.0044 | 73.8432 | -1 |
| 4 | 2008-07-19 15:22:00 | 3032.24 | 2502.87 | 2233.3667 | 1326.5200 | 1.5334 | 100.0 | 100.3967 | 0.1235 | 1.5031 | ... | NaN | 0.4800 | 0.4766 | 0.1045 | 99.3032 | 0.0202 | 0.0149 | 0.0044 | 73.8432 | -1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1562 | 2008-10-16 15:13:00 | 2899.41 | 2464.36 | 2179.7333 | 3085.3781 | 1.4843 | 100.0 | 82.2467 | 0.1248 | 1.3424 | ... | 203.1720 | 0.4988 | 0.0143 | 0.0039 | 2.8669 | 0.0068 | 0.0138 | 0.0047 | 203.1720 | -1 |
| 1563 | 2008-10-16 20:49:00 | 3052.31 | 2522.55 | 2198.5667 | 1124.6595 | 0.8763 | 100.0 | 98.4689 | 0.1205 | 1.4333 | ... | NaN | 0.4975 | 0.0131 | 0.0036 | 2.6238 | 0.0068 | 0.0138 | 0.0047 | 203.1720 | -1 |
| 1564 | 2008-10-17 05:26:00 | 2978.81 | 2379.78 | 2206.3000 | 1110.4967 | 0.8236 | 100.0 | 99.4122 | 0.1208 | NaN | ... | 43.5231 | 0.4987 | 0.0153 | 0.0041 | 3.0590 | 0.0197 | 0.0086 | 0.0025 | 43.5231 | -1 |
| 1565 | 2008-10-17 06:01:00 | 2894.92 | 2532.01 | 2177.0333 | 1183.7287 | 1.5726 | 100.0 | 98.7978 | 0.1213 | 1.4622 | ... | 93.4941 | 0.5004 | 0.0178 | 0.0038 | 3.5662 | 0.0262 | 0.0245 | 0.0075 | 93.4941 | -1 |
| 1566 | 2008-10-17 06:07:00 | 2944.92 | 2450.76 | 2195.4444 | 2914.1792 | 1.5978 | 100.0 | 85.1011 | 0.1235 | NaN | ... | 137.7844 | 0.4987 | 0.0181 | 0.0040 | 3.6275 | 0.0117 | 0.0162 | 0.0045 | 137.7844 | -1 |
1567 rows × 592 columns
Q1 B - Print 5 point summary and share at least 2 observations.
signal.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1561.0 | 3014.452896 | 73.621787 | 2743.2400 | 2966.260000 | 3011.4900 | 3056.6500 | 3356.3500 |
| 1 | 1560.0 | 2495.850231 | 80.407705 | 2158.7500 | 2452.247500 | 2499.4050 | 2538.8225 | 2846.4400 |
| 2 | 1553.0 | 2200.547318 | 29.513152 | 2060.6600 | 2181.044400 | 2201.0667 | 2218.0555 | 2315.2667 |
| 3 | 1553.0 | 1396.376627 | 441.691640 | 0.0000 | 1081.875800 | 1285.2144 | 1591.2235 | 3715.0417 |
| 4 | 1553.0 | 4.197013 | 56.355540 | 0.6815 | 1.017700 | 1.3168 | 1.5257 | 1114.5366 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 586 | 1566.0 | 0.021458 | 0.012358 | -0.0169 | 0.013425 | 0.0205 | 0.0276 | 0.1028 |
| 587 | 1566.0 | 0.016475 | 0.008808 | 0.0032 | 0.010600 | 0.0148 | 0.0203 | 0.0799 |
| 588 | 1566.0 | 0.005283 | 0.002867 | 0.0010 | 0.003300 | 0.0046 | 0.0064 | 0.0286 |
| 589 | 1566.0 | 99.670066 | 93.891919 | 0.0000 | 44.368600 | 71.9005 | 114.7497 | 737.3048 |
| Pass/Fail | 1567.0 | -0.867262 | 0.498010 | -1.0000 | -1.000000 | -1.0000 | -1.0000 | 1.0000 |
591 rows × 8 columns
Insights
Creating a copy for Backup
signal_copy = signal.copy()
Q2 A - Write a for loop which will remove all the features with 20%+ Null values and impute rest with mean of the feature.
signal_copy['Time'] = pd.to_numeric(signal_copy['Time'], errors='coerce')
non_numeric_count = signal_copy['Time'].isnull().sum()
if non_numeric_count > 0:
print(f"Warning: {non_numeric_count} non-numeric values found in the 'Time' column.")
Warning: 1567 non-numeric values found in the 'Time' column.
threshold = 0.2 * len(signal_copy)
columns_to_remove = []
mean_values = {}
for column in signal_copy.columns:
if signal_copy[column].isnull().sum() >= threshold:
columns_to_remove.append(column)
else:
try:
mean = signal_copy[column].mean()
mean_values[column] = mean
except (TypeError, ValueError):
print(f"Skipping column '{column}' due to non-numeric data.")
signal_copy.drop(columns=columns_to_remove, inplace=True)
for column, mean in mean_values.items():
signal_copy[column].fillna(mean, inplace=True)
print('Initial shape of dataframe : ',signal.shape)
print('Current shape of dataframe : ',signal_copy.shape)
Initial shape of dataframe : (1567, 592) Current shape of dataframe : (1567, 559)
Clearly, our operation has executed an we have managed to eliminate 33 Features
Q2 B - Identify and drop the features which are having same value for all the rows.
Q2 C - Drop other features if required using relevant functional knowledge. Clearly justify the same.
plt.style.use('default')
signal_copy.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1567 entries, 0 to 1566 Columns: 559 entries, 0 to Pass/Fail dtypes: float64(558), int64(1) memory usage: 6.7 MB
null_counts = signal_copy.isnull().sum()
print(null_counts[null_counts > 0])
Series([], dtype: int64)
constant_columns = [col for col in signal_copy.columns if signal_copy[col].nunique() == 1]
signal_copy.drop(columns=constant_columns, inplace=True)
signal_copy.shape
(1567, 443)
correlation_matrix = signal_copy.corr()
mask = (correlation_matrix.abs() >= 0.9) & (correlation_matrix.abs() < 1.0)
correlated_features = set()
for column in mask.columns:
correlated_features.update(mask.index[mask[column]].tolist())
signal_copy = signal_copy.drop(columns=correlated_features)
signal_copy.shape
(1567, 140)
Q2 D - Check for multi-collinearity in the data and take necessary action.
vif = pd.DataFrame()
temp_x = signal_copy
vif['Features'] = temp_x.columns
vif['VIF'] = [variance_inflation_factor(temp_x.values, i) for i in range(temp_x.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif
high_vif_features = vif[vif["VIF"] > 10]["Features"]
features_to_drop = high_vif_features.tolist()
high_vif_features = vif[vif["VIF"] > 10]["Features"]
features_to_drop = high_vif_features.tolist()
signal_copy.drop(columns=features_to_drop, inplace=True)
high_vif_features
87 131
34 43
45 60
29 37
42 57
...
32 40
94 368
103 460
102 439
19 23
Name: Features, Length: 91, dtype: object
signal_copy.shape
(1567, 49)
Q2 E - Make all relevant modifications on the data using both functional/logical reasoning/assumptions.
Firstly, we made a copy of the dataset.Next, we removed features with 20% or more missing values and imputed the remaining missing values with the mean of the featureFurthermore, we Removed Features with Zero Standard Deviation ie Identifying and dropping the features which are having same values for all the rows;Next, we calculated the correlation matrix and got the opportunity to eliminate few of the highly correlated features as;Lastly, we checked for multi-collinearity in the data and took necessary action as;We could have executed PCA(Principal component Analysis) here itself, rather it is done in further steps in the project due to the project flow
Q3 A - Perform a detailed univariate Analysis with appropriate detailed comments after each analysis.
num_cols = signal_copy.select_dtypes(include=['float64', 'int64'])
num_features = num_cols.columns
n_cols = 4
def plot_histogram(selected_feature):
plt.figure(figsize=(12, 5))
sns.histplot(signal_copy[selected_feature], bins=20, kde=True, color='tan')
plt.title(selected_feature)
plt.tight_layout()
plt.tick_params(axis='x', labelsize= 7, colors='darkblue')
plt.tick_params(axis='y', labelsize= 7, colors='darkblue')
plt.grid(ls = 'dotted',c = 'dimgray')
plt.show()
@interact
def interactive_histogram(selected_feature=widgets.Dropdown(options=num_features, description='Select Feature')):
plot_histogram(selected_feature)
num_cols = signal_copy.select_dtypes(include=['float64', 'int64'])
num_features = num_cols.columns
def plot_boxplot(selected_feature):
flierprops = dict(marker='o', markeredgecolor='red', markersize=8, linestyle='none')
sns.boxplot(x=signal_copy[selected_feature], whis=1.5, flierprops=flierprops,palette = 'pastel')
plt.title(selected_feature)
plt.tick_params(axis='x', labelsize= 7, colors='darkblue')
plt.tick_params(axis='y', labelsize= 7, colors='darkblue')
plt.grid(ls = 'dotted',c = 'dimgray')
plt.tight_layout()
plt.figure(figsize=(8, 5))
@interact
def interactive_boxplot(selected_feature=widgets.Dropdown(options=num_features, description='Select Feature')):
plt.clf()
plot_boxplot(selected_feature)
plt.show()
<Figure size 800x500 with 0 Axes>
cols = signal_copy.select_dtypes(include=[float, int]).columns
skew_scores = signal_copy[cols].skew()
print("Skewness Scores:")
skew_scores
Skewness Scores:
9 0.331433 10 0.057724 24 -0.054125 41 12.307135 59 4.730023 75 0.388149 76 -0.195524 77 0.594708 78 0.176220 79 1.005622 80 -0.185203 81 -0.685258 82 0.234897 91 -0.138299 95 0.127172 100 -0.075279 102 -0.206321 107 -0.280079 108 0.413221 129 -0.979244 367 5.456667 412 1.816378 418 0.456661 419 0.499839 423 2.084869 432 3.346368 433 1.364392 438 10.164666 468 1.262658 476 5.458772 482 0.469546 483 1.714218 484 1.534042 485 1.524456 486 0.615203 487 1.160917 488 0.356705 489 1.047157 496 3.280515 499 0.743494 500 0.920019 510 4.109312 511 0.700040 521 9.040238 555 1.924065 560 2.171562 586 1.438483 589 2.715340 Pass/Fail 3.487359 dtype: float64
Features with negative skewness values (such as 24, 76, 81, 129, 496, and 511) are negatively skewed. This indicates that these traits are concentrated on the higher end and have a lengthy left tail, with a few very low values.
"Pass/Fail" Column: Positive skewness is also present in the "Pass/Fail" column (skewness = 3.487359). This shows that the "Pass/Fail" labels may be unbalanced, with more records falling into the "Pass" group than the "Fail" category.
Features with significant positive skewness frequently point to outliers or extreme values. These outliers must be handled correctly during data preprocessing or modelling since they have a major impact on model performance.
Transformation: To reduce skewness in highly skewed features and make the data more symmetric and appropriate for specific statistical analyses or modelling techniques, you may want to consider performing transformations like logarithmic, square root, or Box-Cox transformations.
Q3 B - Perform bivariate and multivariate analysis with appropriate detailed comments after each analysis.
fig = px.scatter(signal_copy, x='108', y='91', title=f"Scatter Plot of Column 108 vs. Column 91",
labels={f'108': f'Column 108', f'91': f'Column 91'})
fig.update_traces(marker=dict(size=10, opacity=0.5),
selector=dict(mode='markers'))
fig.show()
Insights
The scatter plot illustrates the connection between columns 108 and 91. Given the high positive connection between the two columns, it can be said that when column 108's value rises, so too does column 91's. The link is very high, with a correlation coefficient of 0.9.
The scatter plot contains a few outliers. These data points are very remote from the other data points. A number of things, including measurement or data entry errors, can lead to outliers. Outliers must be looked into to see if they are real data points or if they need to be excluded from the analysis.
Overall, the scatter plot reveals that column 108 and column 91 have a significant positive connection. As a result, it is likely that both columns will travel in the same direction. Making predictions about one column depending on the value of the other column may be possible with the use of this information.
There is no discernible trend because the data points are dispersed equally across the display. There are certain data points that do not correspond to the trend since the correlation is not perfect. The distribution of the outliers suggests that they may not all be due to the same causes. I hope this is useful. If you have any further inquiries, please let me know.
fig = px.scatter(signal_copy, x='82', y='9', title=f"Scatter Plot of Column 82 vs. Column 9",
labels={f'82': f'Column 81', f'9': f'Column 91'})
fig.update_traces(marker=dict(size=10, opacity=0.5),
selector=dict(mode='markers'))
fig.show()
The association between column 82 and column 9 is depicted by the scatter plot. The values of the two columns have a weakly positive association, which means that if column 82's value rises, column 9's value also tends to rise, though not as sharply as in the previous scatter plot. A weak association may be seen by the correlation coefficient, which is 0.3.
The scatter plot has a few outliers. These data points are very remote from the other data points. A number of things, including measurement or data entry errors, can lead to outliers. Outliers must be looked into to see if they are real data points or if they need to be excluded from the analysis.
Overall, the scatter plot reveals that column 82 and column 9 have a weakly positive connection. This indicates that there is some chance that both columns will move in the same direction. With less certainty than in the preceding scatter plot, this information can be useful for making predictions about one column based on the value of the other column.
There is a distinct trend since the data points are not scattered equally over the plot. There are certain data points that do not correspond to the trend since the correlation is not perfect. The distribution of the outliers suggests that they may not all be due to the same causes.
import plotly.graph_objects as go
corr_mat = signal_copy.corr()
custom_color_scale = [
[0.0, 'rgb(240, 240, 240)'],
[0.2, 'rgb(219, 235, 255)'],
[0.4, 'rgb(184, 211, 255)'],
[0.6, 'rgb(153, 186, 255)'],
[0.8, 'rgb(120, 160, 255)'],
[1.0, 'rgb(87, 135, 255)']
]
heatmap = go.Heatmap(
x=corr_mat.columns,
y=corr_mat.columns,
z=corr_mat.values,
colorscale=custom_color_scale,
colorbar=dict(title="Correlation"),
)
fig = go.Figure(data=[heatmap])
fig.update_layout(
title="Customized Correlation Heatmap",
xaxis_title="Features",
yaxis_title="Features",
width=900,
height=900,
)
fig.show()
Insights
The relationship between the independent features and the Pass/Fail target variable is displayed on the heatmap. The correlation is stronger the darker the colour.
The target variable, Pass/Fail, and the variables Features_586 and Features_560 have a substantial positive correlation. This indicates that pupils who score higher for these characteristics have a higher chance of passing the test.
The aim variable, Pass/Fail, and the feature Features_555 have a somewhat negative connection. Higher scores for this trait indicate that pupils are less likely to succeed on the test.
The target variable, Pass/Fail, and the feature Features_521 do not correlate. This indicates that the importance of this trait has no bearing on the likelihood of the student passing the test.
The heatmap reveals that the goal variable, Pass/Fail, is substantially associated with a small number of attributes. The likelihood that a student will pass the test can be predicted using these characteristics.
Between -1 and 1, the correlation coefficient can lie. Perfect positive correlation is represented by a value of 1, perfect negative correlation by a value of 1, and no correlation by a value of 0.
Q4 A - Segregate predictors vs target attributes.
x = signal_copy.drop(['Pass/Fail'], axis=1)
y = signal_copy[['Pass/Fail']]
Q4 B - Check for target balancing and fix it if found imbalanced.
replaceStruct = {
"Pass/Fail": {-1 : 0, 1 : 1 }
}
y = y.replace(replaceStruct)
y.value_counts()
Pass/Fail 0 1463 1 104 dtype: int64
Clearly there is a need for upscaling for the class 1
smote_up = SMOTE(sampling_strategy = 1, k_neighbors = 5,random_state = 1)
x_smote,y_smote = smote_up.fit_resample(x,y)
Q4 C - Perform train-test split and standardize the data or vice versa if required.
x_train, x_test, y_train, y_test = train_test_split(x_smote, y_smote, test_size=0.3, random_state=1)
pipeline = Pipeline([
('scaler', StandardScaler()),
])
x_in_whole = pipeline.fit_transform(x_smote)
x_train_preprocessed = pipeline.fit_transform(x_train)
x_test_preprocessed = pipeline.transform(x_test)
Q4 D - Check if the train and test data have similar statistical characteristics when compared with original data.
original_data_summary = signal_copy.describe()
train_data_summary = pd.DataFrame(x_train_preprocessed, columns=x_smote.columns).describe()
test_data_summary = pd.DataFrame(x_test_preprocessed, columns=x_smote.columns).describe()
print("5-Point Summary for Original Data:")
original_data_summary
5-Point Summary for Original Data:
| 9 | 10 | 24 | 41 | 59 | 75 | 76 | 77 | 78 | 79 | ... | 499 | 500 | 510 | 511 | 521 | 555 | 560 | 586 | 589 | Pass/Fail | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | ... | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 | 1567.000000 |
| mean | -0.000841 | 0.000146 | -298.598136 | 3.353066 | 2.960241 | -0.006903 | -0.029390 | -0.007041 | -0.013643 | 0.003458 | ... | 263.195864 | 240.981377 | 55.763508 | 275.979457 | 11.610080 | 57.746537 | 0.072443 | 0.021458 | 99.670066 | -0.867262 |
| std | 0.015107 | 0.009296 | 2900.835956 | 2.342268 | 9.510891 | 0.022121 | 0.032948 | 0.031127 | 0.047504 | 0.022902 | ... | 324.563886 | 322.797084 | 37.667659 | 329.454099 | 103.122996 | 32.152263 | 0.051561 | 0.012354 | 93.861936 | 0.498010 |
| min | -0.053400 | -0.034900 | -14804.500000 | -0.075900 | -28.988200 | -0.104900 | -0.186200 | -0.104600 | -0.348200 | -0.056800 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 4.826900 | 0.017700 | -0.016900 | 0.000000 | -1.000000 |
| 25% | -0.010800 | -0.005600 | -1474.375000 | 2.699000 | -1.855450 | -0.019200 | -0.051350 | -0.029400 | -0.047300 | -0.010700 | ... | 0.000000 | 0.000000 | 35.324400 | 0.000000 | 0.000000 | 34.147100 | 0.036200 | 0.013450 | 44.368600 | -1.000000 |
| 50% | -0.001300 | 0.000400 | -80.500000 | 3.080000 | 0.973600 | -0.006600 | -0.029390 | -0.009400 | -0.013643 | 0.000800 | ... | 0.000000 | 0.000000 | 47.058800 | 0.000000 | 0.000000 | 57.746537 | 0.059200 | 0.020500 | 72.023000 | -1.000000 |
| 75% | 0.008400 | 0.005900 | 1376.250000 | 3.515000 | 4.337700 | 0.006600 | -0.006900 | 0.008900 | 0.012050 | 0.012800 | ... | 536.122600 | 505.225750 | 64.228450 | 554.010700 | 0.000000 | 69.630650 | 0.089000 | 0.027600 | 114.749700 | -1.000000 |
| max | 0.074900 | 0.053000 | 14106.000000 | 37.880000 | 168.145500 | 0.231500 | 0.072300 | 0.133100 | 0.249200 | 0.101300 | ... | 1000.000000 | 999.233700 | 451.485100 | 1000.000000 | 1000.000000 | 303.550000 | 0.445700 | 0.102800 | 737.304800 | 1.000000 |
8 rows × 49 columns
print("\n5-Point Summary for Training Data:")
train_data_summary
5-Point Summary for Training Data:
| 9 | 10 | 24 | 41 | 59 | 75 | 76 | 77 | 78 | 79 | ... | 496 | 499 | 500 | 510 | 511 | 521 | 555 | 560 | 586 | 589 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2.048000e+03 | 2.048000e+03 | 2.048000e+03 | 2.048000e+03 | 2.048000e+03 | 2.048000e+03 | 2.048000e+03 | 2.048000e+03 | 2.048000e+03 | 2.048000e+03 | ... | 2.048000e+03 | 2.048000e+03 | 2.048000e+03 | 2.048000e+03 | 2.048000e+03 | 2.048000e+03 | 2.048000e+03 | 2.048000e+03 | 2.048000e+03 | 2.048000e+03 |
| mean | 9.432559e-18 | -5.312591e-18 | -1.095044e-17 | 8.971773e-17 | 2.276825e-17 | 5.258381e-18 | -2.940898e-17 | -3.198396e-18 | 2.592598e-17 | 3.079134e-17 | ... | 1.176766e-16 | -2.509928e-16 | -3.805550e-16 | -1.615461e-17 | 1.034600e-16 | 1.283966e-16 | -1.069972e-16 | 4.320546e-17 | 1.878651e-16 | -9.649399e-18 |
| std | 1.000244e+00 | 1.000244e+00 | 1.000244e+00 | 1.000244e+00 | 1.000244e+00 | 1.000244e+00 | 1.000244e+00 | 1.000244e+00 | 1.000244e+00 | 1.000244e+00 | ... | 1.000244e+00 | 1.000244e+00 | 1.000244e+00 | 1.000244e+00 | 1.000244e+00 | 1.000244e+00 | 1.000244e+00 | 1.000244e+00 | 1.000244e+00 | 1.000244e+00 |
| min | -2.904904e+00 | -3.731015e+00 | -5.633662e+00 | -1.611235e+00 | -3.796971e+00 | -3.973806e+00 | -4.868547e+00 | -3.325080e+00 | -2.750481e+00 | -2.211578e+00 | ... | -1.133406e+00 | -7.873518e-01 | -7.677691e-01 | -1.289299e+00 | -9.708956e-01 | -1.312196e-01 | -1.704655e+00 | -1.192809e+00 | -2.440078e+00 | -1.225707e+00 |
| 25% | -6.342354e-01 | -6.273500e-01 | -4.471609e-01 | -3.426493e-01 | -6.751616e-01 | -5.235057e-01 | -6.218777e-01 | -6.850805e-01 | -7.868634e-01 | -7.013820e-01 | ... | -5.633589e-01 | -7.873518e-01 | -7.677691e-01 | -6.242206e-01 | -9.708956e-01 | -1.312196e-01 | -5.469288e-01 | -7.318046e-01 | -6.118039e-01 | -6.193576e-01 |
| 50% | -3.379751e-02 | 4.620126e-02 | 1.110436e-01 | -1.301573e-01 | -2.866028e-01 | -5.991937e-03 | 5.677833e-04 | -5.181965e-02 | -8.981676e-02 | -1.546239e-01 | ... | -2.886362e-01 | -7.873518e-01 | -7.677691e-01 | -2.588417e-01 | -1.994868e-01 | -1.312196e-01 | -7.347831e-02 | -2.624185e-01 | -5.804792e-02 | -2.936074e-01 |
| 75% | 6.094050e-01 | 6.738693e-01 | 6.395955e-01 | 1.821923e-01 | 5.828571e-01 | 4.711263e-01 | 6.122519e-01 | 5.302953e-01 | 5.648049e-01 | 5.365491e-01 | ... | 1.260515e-01 | 8.358803e-01 | 7.279894e-01 | 2.691078e-01 | 9.330603e-01 | -1.312196e-01 | 3.496466e-01 | 4.647108e-01 | 5.044978e-01 | 2.640988e-01 |
| max | 5.738246e+00 | 4.058793e+00 | 5.640487e+00 | 1.644450e+01 | 5.222279e+00 | 9.646746e+00 | 3.269144e+00 | 4.719007e+00 | 5.975961e+00 | 4.063673e+00 | ... | 8.848695e+00 | 2.490497e+00 | 2.500737e+00 | 9.081032e+00 | 2.101195e+00 | 9.399589e+00 | 7.959662e+00 | 7.775818e+00 | 7.123200e+00 | 7.360620e+00 |
8 rows × 48 columns
print("\n5-Point Summary for Testing Data:")
test_data_summary
5-Point Summary for Testing Data:
| 9 | 10 | 24 | 41 | 59 | 75 | 76 | 77 | 78 | 79 | ... | 496 | 499 | 500 | 510 | 511 | 521 | 555 | 560 | 586 | 589 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 878.000000 | 878.000000 | 878.000000 | 878.000000 | 878.000000 | 878.000000 | 878.000000 | 878.000000 | 878.000000 | 878.000000 | ... | 878.000000 | 878.000000 | 878.000000 | 878.000000 | 878.000000 | 878.000000 | 878.000000 | 878.000000 | 878.000000 | 878.000000 |
| mean | 0.018976 | 0.027381 | -0.011043 | -0.034084 | -0.044994 | 0.065161 | 0.034007 | -0.062433 | -0.078455 | -0.013626 | ... | 0.021997 | 0.063223 | 0.019492 | -0.011945 | -0.060982 | -0.031992 | -0.053374 | 0.010913 | -0.028464 | -0.048220 |
| std | 0.987502 | 1.030691 | 1.015703 | 0.450221 | 1.149969 | 1.021465 | 0.956893 | 0.964689 | 1.017116 | 0.968820 | ... | 0.976154 | 1.014271 | 1.012952 | 1.031786 | 0.989831 | 0.879218 | 0.995220 | 0.988126 | 0.961445 | 1.006755 |
| min | -3.829625 | -3.955182 | -5.020633 | -1.659792 | -2.723825 | -3.305735 | -3.143205 | -2.559462 | -7.415193 | -2.778515 | ... | -1.001268 | -0.787352 | -0.767769 | -1.567657 | -0.970896 | -0.131220 | -1.802953 | -1.192809 | -3.398163 | -1.225707 |
| 25% | -0.599879 | -0.603115 | -0.447580 | -0.324647 | -0.716183 | -0.424931 | -0.574276 | -0.747271 | -0.828466 | -0.638471 | ... | -0.535566 | -0.787352 | -0.767769 | -0.631136 | -0.970896 | -0.131220 | -0.625223 | -0.730178 | -0.625708 | -0.655670 |
| 50% | 0.003513 | 0.046201 | 0.113091 | -0.122764 | -0.328694 | -0.005992 | 0.050932 | -0.099525 | -0.131431 | -0.169777 | ... | -0.269651 | -0.544023 | -0.767769 | -0.262510 | -0.395794 | -0.131220 | -0.073478 | -0.250639 | -0.068993 | -0.296909 |
| 75% | 0.590806 | 0.662162 | 0.667995 | 0.192960 | 0.428558 | 0.514850 | 0.632788 | 0.427960 | 0.540764 | 0.498050 | ... | 0.161262 | 0.924442 | 0.833331 | 0.214022 | 0.840041 | -0.131220 | 0.278682 | 0.440777 | 0.506294 | 0.203893 |
| max | 4.694207 | 5.896964 | 3.845598 | 1.357550 | 17.710203 | 9.408401 | 3.397045 | 3.236757 | 3.380220 | 3.652536 | ... | 7.690617 | 2.501279 | 2.504466 | 8.791647 | 2.065052 | 9.399589 | 7.831842 | 6.807710 | 7.123200 | 7.730893 |
8 rows × 48 columns
Insights/Observations :
Size of Datasets:
Standardization:
Data Splitting:
Data Consistency:
Target Variable (Pass/Fail):
Model Evaluation:
Data Transformation:
Target Variable (Pass/Fail):
Q5 A - Use any Supervised Learning technique to train a model.
svm_base_model = SVC()
svm_base_model.fit(x_train_preprocessed,y_train)
C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
SVC()
y_pred_base_svm = svm_base_model.predict(x_test_preprocessed)
Performance Metrics
print('Score for Training Data : ',round(svm_base_model.score(x_train_preprocessed,y_train),3))
print(' ')
print('Score for Testing Data : ',round(svm_base_model.score(x_test_preprocessed,y_test),3))
Score for Training Data : 0.994 Score for Testing Data : 0.967
print(accuracy_score(y_test,y_pred_base_svm))
0.9669703872437357
print(classification_report(y_test,y_pred_base_svm))
precision recall f1-score support
0 1.00 0.94 0.97 444
1 0.94 1.00 0.97 434
accuracy 0.97 878
macro avg 0.97 0.97 0.97 878
weighted avg 0.97 0.97 0.97 878
def con_matrix(actual, predictions):
from sklearn.metrics import confusion_matrix
matrix = pd.DataFrame(confusion_matrix(actual, predictions), columns = np.unique(actual), index = np.unique(actual))
matrix.loc[:,'total_count_actual_class'] = matrix.sum(axis = 1)
matrix.loc['total_count_predicted'] = matrix.sum(axis = 0)
return matrix
con_matrix(y_test, y_pred_base_svm)
| 0 | 1 | total_count_actual_class | |
|---|---|---|---|
| 0 | 416 | 28 | 444 |
| 1 | 1 | 433 | 434 |
| total_count_predicted | 417 | 461 | 878 |
Q5 B - Use cross validation techniques.
1 : K-Fold C.V
warnings.filterwarnings("ignore", category=DataConversionWarning)
k_fold = KFold(n_splits=50, shuffle=True, random_state = 42)
results = cross_val_score(svm_base_model, x_train_preprocessed, y_train, cv = k_fold)
warnings.resetwarnings()
print(results)
print('----------------------------')
print('Mean : ')
print(np.mean(abs(results)))
print('----------------------------')
print('Std Dev : ')
print(results.std())
[1. 0.95121951 0.97560976 1. 0.97560976 1. 0.97560976 1. 0.92682927 1. 0.97560976 1. 0.95121951 1. 1. 0.90243902 0.92682927 0.92682927 1. 0.97560976 0.92682927 1. 1. 1. 1. 1. 0.97560976 1. 0.95121951 0.97560976 0.97560976 0.95121951 1. 0.92682927 1. 0.95121951 1. 1. 0.95121951 1. 0.95121951 1. 0.92682927 0.95121951 0.92682927 1. 1. 0.95121951 1. 1. ] ---------------------------- Mean : 0.9751219512195121 ---------------------------- Std Dev : 0.029060236114354102
2 : Stratified Cross C.V
warnings.filterwarnings("ignore", category=DataConversionWarning)
score = cross_val_score(svm_base_model, x_train_preprocessed, y_train, cv = StratifiedKFold())
warnings.resetwarnings()
print(score)
print('----------------------------')
print('Mean : ')
print(np.mean(abs(score)))
print('----------------------------')
print('Std Dev : ')
print(score.std())
[0.96585366 0.96829268 0.96829268 0.97555012 0.97310513] ---------------------------- Mean : 0.970218856222792 ---------------------------- Std Dev : 0.0035560719217545816
3 : Bootstrapping
warnings.filterwarnings("ignore")
SCM_temp = SVC()
SCM_temp_model = SCM_temp.fit(x_train_preprocessed, y_train)
y_pred = SCM_temp.predict(x_test_preprocessed)
accuracy = []
bootstrap_iteration = 10
for i in range(bootstrap_iteration):
x_, y_ = resample(x_train_preprocessed, y_train)
SCM_temp_model.fit(x_, y_)
y_pred = SCM_temp_model.predict(x_test)
acc = accuracy_score(y_pred, y_test)
accuracy.append(acc)
accuracy = np.array(accuracy)
print(accuracy)
print('--------------------------')
print('Average: ', accuracy.mean())
print('--------------------------')
print('Standard deviation: ', accuracy.std())
[0.50569476 0.50569476 0.50569476 0.50569476 0.50569476 0.50569476 0.50569476 0.50569476 0.50569476 0.50569476] -------------------------- Average: 0.5056947608200455 -------------------------- Standard deviation: 1.1102230246251565e-16
Comparing the Best C.V Technique for our Model
warnings.filterwarnings("ignore", category=DataConversionWarning)
model1_scores = cross_val_score(svm_base_model, x_train_preprocessed, y_train, cv=k_fold)
model2_scores = cross_val_score(svm_base_model, x_train_preprocessed, y_train, cv=StratifiedKFold())
warnings.resetwarnings()
model3_scores = accuracy
model1_mean = np.mean(model1_scores)
model2_mean = np.mean(model2_scores)
model3_mean = np.mean(model3_scores)
model1_std = np.std(model1_scores)
model2_std = np.std(model2_scores)
model3_std = np.std(model3_scores)
models_data = [
("K-Fold", model1_mean, model1_std),
("Stratified Sampling", model2_mean, model2_std),
("Bootstrap", model3_mean, model3_std)
]
sorted_models = sorted(models_data, key=lambda x: x[1], reverse=True)
print("Models sorted by mean (descending order):")
for model_name, mean, std in sorted_models:
print(f"{model_name}: Mean = {mean:.2f}, Std Dev = {std:.2f}")
Models sorted by mean (descending order): K-Fold: Mean = 0.98, Std Dev = 0.03 Stratified Sampling: Mean = 0.97, Std Dev = 0.00 Bootstrap: Mean = 0.51, Std Dev = 0.00
Looks like K-Fold Cross Validation has given great results for our model
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, make_scorer, confusion_matrix
warnings.filterwarnings("ignore", category=DataConversionWarning)
accuracy_scores = cross_val_score(svm_base_model, x_train_preprocessed, y_train, cv=k_fold, scoring=make_scorer(accuracy_score))
y_pred = cross_val_predict(svm_base_model, x_train_preprocessed, y_train, cv=k_fold)
mean_accuracy = np.mean(accuracy_scores)
std_accuracy = np.std(accuracy_scores)
conf_matrix = confusion_matrix(y_train, y_pred)
warnings.resetwarnings()
print("Accuracy Scores: ", accuracy_scores)
print('--------------------------')
print("Mean Accuracy: {:.2f}".format(mean_accuracy))
print('--------------------------')
print("Standard Deviation of Accuracy: {:.2f}".format(std_accuracy))
print('--------------------------')
print("Confusion Matrix:\n", conf_matrix)
Accuracy Scores: [1. 0.95121951 0.97560976 1. 0.97560976 1. 0.97560976 1. 0.92682927 1. 0.97560976 1. 0.95121951 1. 1. 0.90243902 0.92682927 0.92682927 1. 0.97560976 0.92682927 1. 1. 1. 1. 1. 0.97560976 1. 0.95121951 0.97560976 0.97560976 0.95121951 1. 0.92682927 1. 0.95121951 1. 1. 0.95121951 1. 0.95121951 1. 0.92682927 0.95121951 0.92682927 1. 1. 0.95121951 1. 1. ] -------------------------- Mean Accuracy: 0.98 -------------------------- Standard Deviation of Accuracy: 0.03 -------------------------- Confusion Matrix: [[ 971 48] [ 3 1026]]
Q5 C - Apply hyper-parameter tuning techniques to get the best accuracy.
As our data is huge, using GridSearch CV could result in rigorous outputs/time consumption. Hence we move ahead with RandomSearch CV
parameter_grid = [
{'C' : [0.5,1,10,100],
'gamma' : ['scale',1,0.1,0.01,0.001],
'kernel' : ['rbf','linear','poly'],
'degree' : [1,2,3,4,5]}
]
samples = 10
warnings.filterwarnings('ignore', category=DataConversionWarning)
randomCV = RandomizedSearchCV(svm_base_model, param_distributions=parameter_grid,
n_iter=samples, cv=5)
randomCV.fit(x_train_preprocessed, y_train)
warnings.resetwarnings()
print("Best cross-validation accuracy: {:.2f}".format(randomCV.best_score_))
print('----------------------')
print("Best parameters: ", randomCV.best_params_)
print('----------------------')
print("Validation set accuracy: {:.2f}".format(randomCV.score(x_test_preprocessed, y_test)))
Best cross-validation accuracy: 0.99
----------------------
Best parameters: {'kernel': 'rbf', 'gamma': 0.1, 'degree': 1, 'C': 1}
----------------------
Validation set accuracy: 0.99
best_score = randomCV.best_score_
best_params = randomCV.best_params_
print("Best Score:", best_score)
print("Best Parameters:", best_params)
Best Score: 0.9902331683463533
Best Parameters: {'kernel': 'rbf', 'gamma': 0.1, 'degree': 1, 'C': 1}
print("Best Parameters:", best_params)
Best Parameters: {'kernel': 'rbf', 'gamma': 0.1, 'degree': 1, 'C': 1}
Performance Metrics
y_pred_tuning_svm = randomCV.predict(x_test_preprocessed)
print('Score for Training Data : ',round(randomCV.score(x_train_preprocessed,y_train),3))
print(' ')
print('Score for Testing Data : ',round(randomCV.score(x_test_preprocessed,y_test),3))
Score for Training Data : 1.0 Score for Testing Data : 0.994
print(accuracy_score(y_test,y_pred_tuning_svm))
0.9943052391799544
print(classification_report(y_test,y_pred_tuning_svm))
precision recall f1-score support
0 0.99 1.00 0.99 444
1 1.00 0.99 0.99 434
accuracy 0.99 878
macro avg 0.99 0.99 0.99 878
weighted avg 0.99 0.99 0.99 878
def con_matrix(actual, predictions):
from sklearn.metrics import confusion_matrix
matrix = pd.DataFrame(confusion_matrix(actual, predictions), columns = np.unique(actual), index = np.unique(actual))
matrix.loc[:,'total_count_actual_class'] = matrix.sum(axis = 1)
matrix.loc['total_count_predicted'] = matrix.sum(axis = 0)
return matrix
con_matrix(y_test, y_pred_tuning_svm)
| 0 | 1 | total_count_actual_class | |
|---|---|---|---|
| 0 | 444 | 0 | 444 |
| 1 | 5 | 429 | 434 |
| total_count_predicted | 449 | 429 | 878 |
Q5 D - Use any other technique/method which can enhance the model performance.
pca = PCA(n_components = 48)
pca.fit(x_in_whole)
cov_matrix = np.cov(x_in_whole.T)
e_vals, e_vecs = np.linalg.eig(cov_matrix)
tot = sum(e_vals)
var_exp = [(i / tot) * 100 for i in sorted(e_vals, reverse=True)]
plt.figure(figsize=(10, 5))
plt.bar(range(1, e_vals.size + 1), var_exp, label='Individual explained variance', align='center',
color={'brown'}, edgecolor='forestgreen')
plt.ylabel('Explained Variance Ratio')
plt.xlabel('Principal Components')
plt.legend(loc='best')
plt.xlim(0, 50)
plt.tick_params(axis='x', labelsize=13, colors='royalblue')
plt.tick_params(axis='y', labelsize=13, colors='royalblue')
plt.title('Number of Components vs. Cumulative Explained Variance')
plt.grid()
plt.tight_layout()
plt.show()
per_var = np.round(pca.explained_variance_ratio_ * 100, decimals=1)
cumulative_var = np.cumsum(per_var)
n_components = len(per_var)
plt.figure(figsize=(10 , 5))
plt.bar(range(1, n_components + 1), cumulative_var, align='center',
color = {'tan'},edgecolor = 'forestgreen',alpha=0.7);
plt.plot(range(1, n_components + 1), cumulative_var, marker='o', linestyle='-', color='b');
plt.ylabel('Cumulative Explained Variance (%)');
plt.xlabel('Number of Components');
plt.tick_params(axis='x', labelsize=13, colors='royalblue');
plt.tick_params(axis='y', labelsize=13, colors='royalblue');
plt.title('Number of Components vs. Cumulative Explained Variance');
plt.axhline(y=95, color='r', linestyle='-.', label='95% Variance');
plt.legend();
optimal_n_components_manual = np.argmax(np.cumsum(var_exp) >= 95) + 1
print(f"Optimal n_components after PCA: {optimal_n_components_manual}")
Optimal n_components after PCA: 42
best_params = randomCV.best_params_
print("Best Parameters:", best_params)
Best Parameters: {'kernel': 'rbf', 'gamma': 0.1, 'degree': 1, 'C': 1}
pipe_svc = Pipeline([('scl', StandardScaler()),
('pca', PCA(n_components = 42)),
('svc', SVC(kernel = 'rbf',gamma = 0.1, C = 1, degree = 1))])
x_train_pca = pipe_svc.fit(x_train_preprocessed, y_train)
x_test_pca = pipe_svc.fit(x_test_preprocessed, y_test)
x_train_pca_score = pipe_svc.score(x_train_preprocessed, y_train)
x_test_pca_score = pipe_svc.score(x_test_preprocessed, y_test)
print('Train Accuracy: ' ,x_train_pca_score)
print('Test Accuracy: ' ,x_test_pca_score)
C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
Train Accuracy: 0.96923828125 Test Accuracy: 1.0
Q5 E - Display and explain the classification report in detail.
#Model 1
y_pred_train1 = svm_base_model.predict(x_train_preprocessed)
y_pred_test1 = svm_base_model.predict(x_test_preprocessed)
#Model 2
accuracy_scores = cross_val_score(svm_base_model, x_train_preprocessed, y_train, cv=k_fold, scoring=make_scorer(accuracy_score))
y_pred_cv = cross_val_predict(svm_base_model, x_train_preprocessed, y_train, cv=k_fold)
train_accuracy_cv = accuracy_score(y_train, y_pred_cv)
y_pred_test = svm_base_model.predict(x_test_preprocessed)
test_accuracy_cv = accuracy_score(y_test, y_pred_test)
train_recall_cv = recall_score(y_train, y_pred_cv)
test_recall_cv = recall_score(y_test, y_pred_test)
train_precision_cv = precision_score(y_train, y_pred_cv)
test_precision_cv = precision_score(y_test, y_pred_test)
#Model 3
y_pred_train3 = randomCV.predict(x_train_preprocessed)
y_pred_test3 = randomCV.predict(x_test_preprocessed)
accuracy_train3 = accuracy_score(y_train, y_pred_train3)
accuracy_test3 = accuracy_score(y_test, y_pred_test3)
recall_train3 = recall_score(y_train, y_pred_train3)
recall_test3 = recall_score(y_test, y_pred_test3)
precision_train3 = precision_score(y_train, y_pred_train3)
precision_test3 = precision_score(y_test, y_pred_test3)
#Model 4(PCA)
pipe_svc.fit(x_train_preprocessed, y_train)
y_pred_train_pca = pipe_svc.predict(x_train_preprocessed)
y_pred_test_pca = pipe_svc.predict(x_test_preprocessed)
train_accuracy_pca = accuracy_score(y_train, y_pred_train_pca)
test_accuracy_pca = accuracy_score(y_test, y_pred_test_pca)
train_recall_pca = recall_score(y_train, y_pred_train_pca)
test_recall_pca = recall_score(y_test, y_pred_test_pca)
train_precision_pca = precision_score(y_train, y_pred_train_pca)
test_precision_pca = precision_score(y_test, y_pred_test_pca)
C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
model_metrics = [
{
'Model': 'Model 1 (SVM)',
'Train Accuracy': accuracy_score(y_train, y_pred_train1),
'Test Accuracy': accuracy_score(y_test, y_pred_test1),
'Train Recall': recall_score(y_train, y_pred_train1),
'Test Recall': recall_score(y_test, y_pred_test1),
'Train Precision': precision_score(y_train, y_pred_train1),
'Test Precision': precision_score(y_test, y_pred_test1),
},
{
'Model': 'Model 2 (SVM with K-Fold CV)',
'Train Accuracy': train_accuracy_cv,
'Test Accuracy': test_accuracy_cv,
'Train Recall': train_recall_cv,
'Test Recall': test_recall_cv,
'Train Precision': train_precision_cv,
'Test Precision': test_precision_cv,
},
{
'Model': 'Model 3 (SVM with Hyperparameters)',
'Train Accuracy': accuracy_train3,
'Test Accuracy': accuracy_test3,
'Train Recall': recall_train3,
'Test Recall': recall_test3,
'Train Precision': precision_train3,
'Test Precision': precision_test3,
},
{
'Model': 'Model 4 (PCA)',
'Train Accuracy': train_accuracy_pca,
'Test Accuracy': test_accuracy_pca,
'Train Recall': train_recall_pca,
'Test Recall': test_recall_pca,
'Train Precision': train_precision_pca,
'Test Precision': test_precision_pca,
}
]
results_df = pd.DataFrame(model_metrics)
results_df
| Model | Train Accuracy | Test Accuracy | Train Recall | Test Recall | Train Precision | Test Precision | |
|---|---|---|---|---|---|---|---|
| 0 | Model 1 (SVM) | 0.994141 | 0.966970 | 0.999028 | 0.997696 | 0.989413 | 0.939262 |
| 1 | Model 2 (SVM with K-Fold CV) | 0.975098 | 0.966970 | 0.997085 | 0.997696 | 0.955307 | 0.939262 |
| 2 | Model 3 (SVM with Hyperparameters) | 1.000000 | 0.994305 | 1.000000 | 0.988479 | 1.000000 | 1.000000 |
| 3 | Model 4 (PCA) | 1.000000 | 0.996583 | 1.000000 | 0.993088 | 1.000000 | 1.000000 |
sorted_results = results_df.sort_values(by='Test Accuracy', ascending=False)
best_model = sorted_results.iloc[0]
print("Best-Performing Model:")
best_model
Best-Performing Model:
Model Model 4 (PCA) Train Accuracy 1.0 Test Accuracy 0.996583 Train Recall 1.0 Test Recall 0.993088 Train Precision 1.0 Test Precision 1.0 Name: 3, dtype: object
Q5 F - Apply the above steps for all possible models that you have learnt so far.
Contents in this question
: Logistic Model
: kNN Model
: Decision Tree Classifier
: Ada-Boost Classifier
: Gradient-Boost Classifier
from sklearn.linear_model import LogisticRegression
logistic_model_base = LogisticRegression()
logistic_model_base.fit(x_train_preprocessed, y_train)
logistic_model_base_y_pred = logistic_model_base.predict(x_test_preprocessed)
print('Score for Training Data : ',round(logistic_model_base.score(x_train_preprocessed,y_train),3))
print(' ')
print('Score for Testing Data : ',round(logistic_model_base.score(x_test_preprocessed,y_test),3))
Score for Training Data : 0.766 Score for Testing Data : 0.761
C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
parameter_grid = [
{
'C': uniform(0.001, 10),
'penalty': ['l1', 'l2'],
'solver': ['liblinear', 'saga']
}
]
samples = 10
warnings.filterwarnings('ignore', category=DataConversionWarning)
logistic_hyp_param_model = RandomizedSearchCV(logistic_model_base, param_distributions=parameter_grid,
n_iter=samples, cv=5)
logistic_hyp_param_model.fit(x_train_preprocessed, y_train)
warnings.resetwarnings()
print("Best parameters: ", logistic_hyp_param_model.best_params_)
Best parameters: {'C': 4.595217810677961, 'penalty': 'l2', 'solver': 'saga'}
logistic_hyp_param_model_y_pred = logistic_hyp_param_model.predict(x_test_preprocessed)
print('Score for Training Data : ',round(logistic_hyp_param_model.score(x_train_preprocessed,y_train),3))
print(' ')
print('Score for Testing Data : ',round(logistic_hyp_param_model.score(x_test_preprocessed,y_test),3))
from sklearn.decomposition import PCA
pipe_logistic_model_with_hyp = Pipeline([
('pca', PCA(n_components = 42)),
('log_model', LogisticRegression(solver = 'saga',
penalty = 'l2', C = 4.595217810677961))
])
x_train_pca_log_model_with_hyp = pipe_logistic_model_with_hyp.fit(x_train_preprocessed, y_train)
x_test_pca_log_model_with_hyp = pipe_logistic_model_with_hyp.fit(x_test_preprocessed, y_test)
x_train_pca_score_log_model_with_hyp = pipe_logistic_model_with_hyp.score(x_train_preprocessed, y_train)
x_test_pca_score_log_model_with_hyp = pipe_logistic_model_with_hyp.score(x_test_preprocessed, y_test)
print('Train Accuracy: ' ,x_train_pca_score_log_model_with_hyp)
print('Test Accuracy: ' ,x_test_pca_score_log_model_with_hyp)
Train Accuracy: 0.73486328125 Test Accuracy: 0.7596810933940774
C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
from sklearn.decomposition import PCA
pipe_logistic_model_without_hyp = Pipeline([
('pca', PCA(n_components = 42)),
('log_model', LogisticRegression())
])
x_train_pca_log_model_without_hyp = pipe_logistic_model_without_hyp.fit(x_train_preprocessed, y_train)
x_test_pca_log_model_without_hyp = pipe_logistic_model_without_hyp.fit(x_test_preprocessed, y_test)
x_train_pca_score_log_model_without_hyp = pipe_logistic_model_without_hyp.score(x_train_preprocessed, y_train)
x_test_pca_score_log_model_without_hyp = pipe_logistic_model_without_hyp.score(x_test_preprocessed, y_test)
print('Train Accuracy: ' ,x_train_pca_score_log_model_without_hyp)
print('Test Accuracy: ' ,x_test_pca_score_log_model_without_hyp)
Train Accuracy: 0.734375 Test Accuracy: 0.7596810933940774
C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
from sklearn.neighbors import KNeighborsClassifier
knn_base_model = KNeighborsClassifier(n_neighbors = 22)
knn_base_model.fit(x_train_preprocessed,y_train)
C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\neighbors\_classification.py:198: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
KNeighborsClassifier(n_neighbors=22)
knn_base_model_y_pred = knn_base_model.predict(x_test_preprocessed)
print('Score for Training Data : ',round(knn_base_model.score(x_train_preprocessed,y_train),3))
print(' ')
print('Score for Testing Data : ',round(knn_base_model.score(x_test_preprocessed,y_test),3))
Score for Training Data : 0.688 Score for Testing Data : 0.671
parameter_grid = [
{
'n_neighbors': np.arange(1,100),
'weights': ['uniform', 'distance'],
'p': [2,4,6,8,10],
'leaf_size' : [10,20,30,40,50],
'algorithm' : ['auto','ball_tree','kd_tree'],
'metric' : ['minkowski','euclidean','manhattan']
}
]
samples = 10
warnings.filterwarnings('ignore', category=DataConversionWarning)
knn_hyp_param_model = RandomizedSearchCV(knn_base_model, param_distributions=parameter_grid,
n_iter=samples, cv=5)
knn_hyp_param_model.fit(x_train_preprocessed, y_train)
warnings.resetwarnings()
print("Best parameters: ", knn_hyp_param_model.best_params_)
Best parameters: {'weights': 'uniform', 'p': 4, 'n_neighbors': 22, 'metric': 'manhattan', 'leaf_size': 40, 'algorithm': 'auto'}
knn_hyp_param_model_y_pred = knn_hyp_param_model.predict(x_test_preprocessed)
print('Score for Training Data : ',round(knn_hyp_param_model.score(x_train_preprocessed,y_train),3))
print(' ')
print('Score for Testing Data : ',round(knn_hyp_param_model.score(x_test_preprocessed,y_test),3))
from sklearn.decomposition import PCA
pipe_knn_model_with_hyp = Pipeline([
('pca', PCA(n_components = 42)),
('knn_model', KNeighborsClassifier(weights= 'uniform', p= 4,
n_neighbors= 22, metric= 'manhattan',
leaf_size= 40, algorithm= 'auto'))
])
x_train_pca_knn_model_with_hyp = pipe_knn_model_with_hyp.fit(x_train_preprocessed, y_train)
x_test_pca_knn_model_with_hyp = pipe_knn_model_with_hyp.fit(x_test_preprocessed, y_test)
x_train_pca_score_knn_model_with_hyp = pipe_knn_model_with_hyp.score(x_train_preprocessed, y_train)
x_test_pca_score_knn_model_with_hyp = pipe_knn_model_with_hyp.score(x_test_preprocessed, y_test)
print('Train Accuracy: ' ,x_train_pca_score_knn_model_with_hyp)
print('Test Accuracy: ' ,x_test_pca_score_knn_model_with_hyp)
Train Accuracy: 0.63330078125 Test Accuracy: 0.6321184510250569
C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\neighbors\_classification.py:198: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\neighbors\_classification.py:198: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
from sklearn.decomposition import PCA
pipe_knn_model_without_hyp = Pipeline([
('pca', PCA(n_components = 42)),
('knn_model', KNeighborsClassifier())
])
x_train_pca_knn_model_without_hyp = pipe_knn_model_without_hyp.fit(x_train_preprocessed, y_train)
x_test_pca_knn_model_without_hyp = pipe_knn_model_without_hyp.fit(x_test_preprocessed, y_test)
x_train_pca_score_knn_model_without_hyp = pipe_knn_model_without_hyp.score(x_train_preprocessed, y_train)
x_test_pca_score_knn_model_without_hyp = pipe_knn_model_without_hyp.score(x_test_preprocessed, y_test)
print('Train Accuracy: ' ,x_train_pca_score_knn_model_without_hyp)
print('Test Accuracy: ' ,x_test_pca_score_knn_model_without_hyp)
Train Accuracy: 0.71728515625 Test Accuracy: 0.7904328018223234
C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\neighbors\_classification.py:198: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\neighbors\_classification.py:198: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
from sklearn.tree import DecisionTreeClassifier
decision_tree_model_base = DecisionTreeClassifier(random_state = 42)
decision_tree_model_base.fit(x_train_preprocessed,y_train)
DecisionTreeClassifier(random_state=42)
decision_tree_base_model_y_pred = decision_tree_model_base.predict(x_test_preprocessed)
print('Score for Training Data : ',round(decision_tree_model_base.score(x_train_preprocessed,y_train),3))
print(' ')
print('Score for Testing Data : ',round(decision_tree_model_base.score(x_test_preprocessed,y_test),3))
Score for Training Data : 1.0 Score for Testing Data : 0.878
parameter_grid = [
{
'ccp_alpha': np.linspace(0, 1, 100),
'class_weight': ['balanced', None],
'criterion': ['gini', 'entropy'],
'max_depth': [5, 10, 15, 20, 25, 30, None],
'max_features': [None],
'min_impurity_decrease': np.arange(0, 1),
'min_samples_leaf': [1, 2, 3, 4, 5],
'min_samples_split': [2, 3, 4, 5]
}
]
samples = 10
warnings.filterwarnings('ignore', category=DataConversionWarning)
decision_tree_hyp_param_model = RandomizedSearchCV(decision_tree_model_base,
param_distributions=parameter_grid,
n_iter=samples, cv=5)
decision_tree_hyp_param_model.fit(x_train_preprocessed, y_train)
warnings.resetwarnings()
print("Best parameters: ", decision_tree_hyp_param_model.best_params_)
Best parameters: {'min_samples_split': 5, 'min_samples_leaf': 1, 'min_impurity_decrease': 0, 'max_features': None, 'max_depth': 15, 'criterion': 'gini', 'class_weight': 'balanced', 'ccp_alpha': 0.08080808080808081}
decision_tree_hyp_param_model_y_pred = decision_tree_hyp_param_model.predict(x_test_preprocessed)
print('Score for Training Data : ',round(decision_tree_hyp_param_model.score(x_train_preprocessed,y_train),3))
print(' ')
print('Score for Testing Data : ',round(decision_tree_hyp_param_model.score(x_test_preprocessed,y_test),3))
from sklearn.decomposition import PCA
pipe_decision_tree_model_with_hyp = Pipeline([
('pca', PCA(n_components = 42)),
('decision_tree_model', DecisionTreeClassifier(min_samples_split = 5, min_samples_leaf = 1,
min_impurity_decrease = 0, max_features = None,
max_depth = 15, criterion = 'gini',
class_weight = 'balanced',
ccp_alpha = 0.08080808080808081)
)])
x_train_pca_decision_tree_model_with_hyp = pipe_decision_tree_model_with_hyp.fit(x_train_preprocessed, y_train)
x_test_pca_decision_tree_model_with_hyp = pipe_decision_tree_model_with_hyp.fit(x_test_preprocessed, y_test)
x_train_pca_score_decision_tree_model_with_hyp = pipe_decision_tree_model_with_hyp.score(x_train_preprocessed, y_train)
x_test_pca_score_decision_tree_model_with_hyp = pipe_decision_tree_model_with_hyp.score(x_test_preprocessed, y_test)
print('Train Accuracy: ' ,x_train_pca_score_decision_tree_model_with_hyp)
print('Test Accuracy: ' ,x_test_pca_score_decision_tree_model_with_hyp)
Train Accuracy: 0.50244140625 Test Accuracy: 0.49430523917995445
from sklearn.decomposition import PCA
pipe_decision_tree_model_without_hyp = Pipeline([
('pca', PCA(n_components = 42)),
('decision_tree_model', DecisionTreeClassifier()
)])
x_train_pca_decision_tree_model_without_hyp = pipe_decision_tree_model_without_hyp.fit(x_train_preprocessed, y_train)
x_test_pca_decision_tree_model_without_hyp = pipe_decision_tree_model_without_hyp.fit(x_test_preprocessed, y_test)
x_train_pca_score_decision_tree_model_without_hyp = pipe_decision_tree_model_without_hyp.score(x_train_preprocessed, y_train)
x_test_pca_score_decision_tree_model_without_hyp = pipe_decision_tree_model_without_hyp.score(x_test_preprocessed, y_test)
print('Train Accuracy: ' ,x_train_pca_score_decision_tree_model_without_hyp)
print('Test Accuracy: ' ,x_test_pca_score_decision_tree_model_without_hyp)
Train Accuracy: 0.77392578125 Test Accuracy: 1.0
from sklearn.ensemble import AdaBoostClassifier
adaboost_model_base = AdaBoostClassifier(random_state = 1)
adaboost_model_base.fit(x_train_preprocessed,y_train)
C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
AdaBoostClassifier(random_state=1)
adaboost_base_model_y_pred = adaboost_model_base.predict(x_test_preprocessed)
print('Score for Training Data : ',round(adaboost_model_base.score(x_train_preprocessed,y_train),3))
print(' ')
print('Score for Testing Data : ',round(adaboost_model_base.score(x_test_preprocessed,y_test),3))
Score for Training Data : 0.932 Score for Testing Data : 0.896
base_estimator = DecisionTreeClassifier(max_depth=1)
parameter_grid = [
{
'n_estimators': [50, 100, 150, 200, 250],
'learning_rate': [0.1, 0.5, 1.0, 1.5]
}
]
samples = 10
warnings.filterwarnings('ignore', category=DataConversionWarning)
adaboost_hyp_param_model = RandomizedSearchCV(
estimator=AdaBoostClassifier(base_estimator=base_estimator),
param_distributions=parameter_grid,
n_iter=samples,
cv=5
)
adaboost_hyp_param_model.fit(x_train_preprocessed, y_train)
warnings.resetwarnings()
print("Best parameters: ", adaboost_hyp_param_model.best_params_)
Best parameters: {'n_estimators': 200, 'learning_rate': 1.5}
adaboost_hyp_param_model_y_pred = adaboost_hyp_param_model.predict(x_test_preprocessed)
print('Score for Training Data : ',round(adaboost_hyp_param_model.score(x_train_preprocessed,y_train),3))
print(' ')
print('Score for Testing Data : ',round(adaboost_hyp_param_model.score(x_test_preprocessed,y_test),3))
from sklearn.decomposition import PCA
pipe_adaboost_model_with_hyp = Pipeline([
('pca', PCA(n_components = 42)),
('adaboost_model', AdaBoostClassifier(n_estimators = 200, random_state = 1,
learning_rate = 1.5)
)])
x_train_pca_adaboost_model_with_hyp = pipe_adaboost_model_with_hyp.fit(x_train_preprocessed, y_train)
x_test_pca_adaboost_model_with_hyp = pipe_adaboost_model_with_hyp.fit(x_test_preprocessed, y_test)
x_train_pca_score_adaboost_model_with_hyp = pipe_adaboost_model_with_hyp.score(x_train_preprocessed, y_train)
x_test_pca_score_adaboost_model_with_hyp = pipe_adaboost_model_with_hyp.score(x_test_preprocessed, y_test)
print('Train Accuracy: ' ,x_train_pca_score_adaboost_model_with_hyp)
print('Test Accuracy: ' ,x_test_pca_score_adaboost_model_with_hyp)
C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
Train Accuracy: 0.869140625 Test Accuracy: 1.0
from sklearn.decomposition import PCA
pipe_adaboost_model_without_hyp = Pipeline([
('pca', PCA(n_components = 42)),
('adaboost_model', AdaBoostClassifier()
)])
x_train_pca_adaboost_model_without_hyp = pipe_adaboost_model_without_hyp.fit(x_train_preprocessed, y_train)
x_test_pca_adaboost_model_without_hyp = pipe_adaboost_model_without_hyp.fit(x_test_preprocessed, y_test)
x_train_pca_score_adaboost_model_without_hyp = pipe_adaboost_model_without_hyp.score(x_train_preprocessed, y_train)
x_test_pca_score_adaboost_model_without_hyp = pipe_adaboost_model_without_hyp.score(x_test_preprocessed, y_test)
print('Train Accuracy: ' ,x_train_pca_score_adaboost_model_without_hyp)
print('Test Accuracy: ' ,x_test_pca_score_adaboost_model_without_hyp)
C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\utils\validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
Train Accuracy: 0.837890625 Test Accuracy: 0.928246013667426
from sklearn.ensemble import GradientBoostingClassifier
gradientboost_model_base = GradientBoostingClassifier(random_state = 1)
gradientboost_model_base.fit(x_train_preprocessed,y_train)
C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\ensemble\_gb.py:494: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
GradientBoostingClassifier(random_state=1)
gradientboost_base_model_y_pred = gradientboost_model_base.predict(x_test_preprocessed)
print('Score for Training Data : ',round(gradientboost_model_base.score(x_train_preprocessed,y_train),3))
print(' ')
print('Score for Testing Data : ',round(gradientboost_model_base.score(x_test_preprocessed,y_test),3))
Score for Training Data : 0.989 Score for Testing Data : 0.953
parameter_grid = [
{
'n_estimators': [50, 100, 150, 200, 250],
'learning_rate': [0.1, 0.5, 1.0, 1.5],
'ccp_alpha': np.arange(0, 1000, 200),
'max_features' : np.arange(0.2, 0.7, 0.1),
'max_depth': [5, 10, 15, 20, 25, 30, None]
}
]
samples = 10
warnings.filterwarnings('ignore', category=DataConversionWarning)
gradientboost_hyp_param_model = RandomizedSearchCV(
estimator=GradientBoostingClassifier(),
param_distributions=parameter_grid,
n_iter=samples,
cv=5
)
gradientboost_hyp_param_model.fit(x_train_preprocessed, y_train)
warnings.resetwarnings()
print("Best parameters: ", gradientboost_hyp_param_model.best_params_)
Best parameters: {'n_estimators': 150, 'max_features': 0.5000000000000001, 'max_depth': 15, 'learning_rate': 1.0, 'ccp_alpha': 0}
gradientboost_hyp_param_model_y_pred = gradientboost_hyp_param_model.predict(x_test_preprocessed)
print('Score for Training Data : ',round(gradientboost_hyp_param_model.score(x_train_preprocessed,y_train),3))
print(' ')
print('Score for Testing Data : ',round(gradientboost_hyp_param_model.score(x_test_preprocessed,y_test),3))
from sklearn.decomposition import PCA
pipe_gradientboost_model_with_hyp = Pipeline([
('pca', PCA(n_components = 42)),
('gradientboost_model', GradientBoostingClassifier(n_estimators = 150,
max_features = 0.5000000000000001,
max_depth = 15,
learning_rate = 1.0,
ccp_alpha = 0)
)])
x_train_pca_gradientboost_model_with_hyp = pipe_gradientboost_model_with_hyp.fit(x_train_preprocessed, y_train)
x_test_pca_gradientboost_model_with_hyp = pipe_gradientboost_model_with_hyp.fit(x_test_preprocessed, y_test)
x_train_pca_score_gradientboost_model_with_hyp = pipe_gradientboost_model_with_hyp.score(x_train_preprocessed, y_train)
x_test_pca_score_gradientboost_model_with_hyp = pipe_gradientboost_model_with_hyp.score(x_test_preprocessed, y_test)
print('Train Accuracy: ' ,x_train_pca_score_gradientboost_model_with_hyp)
print('Test Accuracy: ' ,x_test_pca_score_gradientboost_model_with_hyp)
C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\ensemble\_gb.py:494: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\ensemble\_gb.py:494: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
Train Accuracy: 0.9033203125 Test Accuracy: 1.0
from sklearn.decomposition import PCA
pipe_gradientboost_model_without_hyp = Pipeline([
('pca', PCA(n_components = 42)),
('gradientboost_model', GradientBoostingClassifier()
)])
x_train_pca_gradientboost_model_without_hyp = pipe_gradientboost_model_without_hyp.fit(x_train_preprocessed, y_train)
x_test_pca_gradientboost_model_without_hyp = pipe_gradientboost_model_without_hyp.fit(x_test_preprocessed, y_test)
x_train_pca_score_gradientboost_model_without_hyp = pipe_gradientboost_model_without_hyp.score(x_train_preprocessed, y_train)
x_test_pca_score_gradientboost_model_without_hyp = pipe_gradientboost_model_without_hyp.score(x_test_preprocessed, y_test)
print('Train Accuracy: ' ,x_train_pca_score_gradientboost_model_without_hyp)
print('Test Accuracy: ' ,x_test_pca_score_gradientboost_model_without_hyp)
C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\ensemble\_gb.py:494: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). C:\Users\SAIF MERCHANT\Anaconda3\lib\site-packages\sklearn\ensemble\_gb.py:494: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
Train Accuracy: 0.88671875 Test Accuracy: 0.9954441913439636
Q6 A- Display and compare all the models designed with their train and test accuracies.
def metrics_score(model, flag=True):
score_list=[]
pred_train = model.predict(x_train_preprocessed)
pred_test = model.predict(x_test_preprocessed)
train_acc = model.score(x_train_preprocessed, y_train)
test_acc = model.score(x_test_preprocessed, y_test)
train_recall = metrics.recall_score(y_train, pred_train)
test_recall = metrics.recall_score(y_test, pred_test)
train_precision = metrics.precision_score(y_train, pred_train)
test_precision = metrics.precision_score(y_test, pred_test)
score_list.extend((train_acc, test_acc, train_recall, test_recall, train_precision, test_precision))
# If the flag is set to True then only the following print statements will be displayed. The default value is set to True.
if flag:
print("Accuracy on training set:", train_acc)
print("Accuracy on test set:", test_acc)
print("Recall on training set:", train_recall)
print("Recall on test set:", test_recall)
print("Precision on training set:", train_precision)
print("Precision on test set:", test_precision)
return score_list # returning the list with train and test scores
all_models = [logistic_model_base,logistic_hyp_param_model,
pipe_logistic_model_with_hyp,pipe_logistic_model_without_hyp,
knn_base_model,knn_hyp_param_model,
pipe_knn_model_with_hyp,pipe_knn_model_without_hyp,
decision_tree_model_base,decision_tree_hyp_param_model,
pipe_decision_tree_model_with_hyp,pipe_decision_tree_model_without_hyp,
adaboost_model_base,adaboost_hyp_param_model,
pipe_adaboost_model_with_hyp,pipe_adaboost_model_without_hyp,
gradientboost_model_base,gradientboost_hyp_param_model,
pipe_gradientboost_model_with_hyp,pipe_gradientboost_model_without_hyp]
acc_train = []
acc_test = []
recall_train = []
recall_test = []
precision_train = []
precision_test = []
for model in all_models:
j = metrics_score(model, False)
acc_train.append(np.round(j[0], 2))
acc_test.append(np.round(j[1], 2))
recall_train.append(np.round(j[2], 2))
recall_test.append(np.round(j[3], 2))
precision_train.append(np.round(j[4], 2))
precision_test.append(np.round(j[5], 2))
comparison_frame = pd.DataFrame({'Model':['logistic_model_base','logistic_hyp_param_model',
'pipe_logistic_model_with_hyp','pipe_logistic_model_without_hyp',
'knn_base_model','knn_hyp_param_model',
'pipe_knn_model_with_hyp','pipe_knn_model_without_hyp',
'decision_tree_model_base','decision_tree_hyp_param_model',
'pipe_decision_tree_model_with_hyp','pipe_decision_tree_model_without_hyp',
'adaboost_model_base','adaboost_hyp_param_model',
'pipe_adaboost_model_with_hyp','pipe_adaboost_model_without_hyp',
'gradientboost_model_base','gradientboost_hyp_param_model',
'pipe_gradientboost_model_with_hyp','pipe_gradientboost_model_without_hyp'],
'Train_Accuracy': acc_train,'Test_Accuracy': acc_test,
'Train_Recall':recall_train,'Test_Recall':recall_test,
'Train_Precision':precision_train,'Test_Precision':precision_test})
comparison_frame
| Model | Train_Accuracy | Test_Accuracy | Train_Recall | Test_Recall | Train_Precision | Test_Precision | |
|---|---|---|---|---|---|---|---|
| 0 | logistic_model_base | 0.77 | 0.76 | 0.79 | 0.78 | 0.75 | 0.75 |
| 1 | logistic_hyp_param_model | 0.77 | 0.76 | 0.79 | 0.78 | 0.75 | 0.75 |
| 2 | pipe_logistic_model_with_hyp | 0.73 | 0.76 | 0.75 | 0.79 | 0.73 | 0.74 |
| 3 | pipe_logistic_model_without_hyp | 0.73 | 0.76 | 0.75 | 0.79 | 0.73 | 0.74 |
| 4 | knn_base_model | 0.69 | 0.67 | 1.00 | 0.99 | 0.62 | 0.60 |
| 5 | knn_hyp_param_model | 0.77 | 0.76 | 0.99 | 0.99 | 0.69 | 0.67 |
| 6 | pipe_knn_model_with_hyp | 0.63 | 0.63 | 0.99 | 1.00 | 0.58 | 0.57 |
| 7 | pipe_knn_model_without_hyp | 0.72 | 0.79 | 1.00 | 1.00 | 0.64 | 0.70 |
| 8 | decision_tree_model_base | 1.00 | 0.88 | 1.00 | 0.89 | 1.00 | 0.87 |
| 9 | decision_tree_hyp_param_model | 0.71 | 0.72 | 0.62 | 0.61 | 0.76 | 0.77 |
| 10 | pipe_decision_tree_model_with_hyp | 0.50 | 0.49 | 1.00 | 1.00 | 0.50 | 0.49 |
| 11 | pipe_decision_tree_model_without_hyp | 0.77 | 1.00 | 0.81 | 1.00 | 0.76 | 1.00 |
| 12 | adaboost_model_base | 0.93 | 0.90 | 0.93 | 0.88 | 0.93 | 0.90 |
| 13 | adaboost_hyp_param_model | 1.00 | 0.91 | 1.00 | 0.92 | 1.00 | 0.91 |
| 14 | pipe_adaboost_model_with_hyp | 0.87 | 1.00 | 0.88 | 1.00 | 0.86 | 1.00 |
| 15 | pipe_adaboost_model_without_hyp | 0.84 | 0.93 | 0.82 | 0.94 | 0.85 | 0.92 |
| 16 | gradientboost_model_base | 0.99 | 0.95 | 0.99 | 0.95 | 0.99 | 0.96 |
| 17 | gradientboost_hyp_param_model | 1.00 | 0.97 | 1.00 | 0.97 | 1.00 | 0.97 |
| 18 | pipe_gradientboost_model_with_hyp | 0.90 | 1.00 | 0.91 | 1.00 | 0.90 | 1.00 |
| 19 | pipe_gradientboost_model_without_hyp | 0.89 | 1.00 | 0.88 | 1.00 | 0.89 | 0.99 |
Q6 B - Select the final best trained model along with your detailed comments for selecting this model.
sorted_results = comparison_frame.sort_values(by=['Train_Accuracy', 'Test_Accuracy'], ascending=[False, False])
best_model = sorted_results.iloc[0]
best_model_df = pd.DataFrame(best_model)
best_model_df
| 17 | |
|---|---|
| Model | gradientboost_hyp_param_model |
| Train_Accuracy | 1.0 |
| Test_Accuracy | 0.97 |
| Train_Recall | 1.0 |
| Test_Recall | 0.97 |
| Train_Precision | 1.0 |
| Test_Precision | 0.97 |
Among all the models, the Gradient Boosting model with hyperparameter adjustment performs the best. It predicts 97% of the test data correctly and has the best test accuracy (0.97). A high test recall of 0.97 means that 97% of the positive cases in the test data are accurately identified by it. The model is not overfitting the training data, as evidenced by the excellent train accuracy and precision.
Here are some thorough remarks for choosing this model:
Q6 C - Pickle the selected model for future use.
import pickle
with open(
"C:/Users/SAIF MERCHANT/Desktop/Great Learning/Featurization, Model Selection & Tuning/Project/gradientboost_hyp_param_model.pkl",
'wb') as model_file:
pickle.dump(gradientboost_hyp_param_model, model_file)
Q6 D - Write your conclusion on the results.